Smoothing models

Get data - smooth disturbed and control plots together - AGB growth

Code
library(mgcv); library(data.table); library(gratia); library(marginaleffects); library(ggplot2); library(dplyr); library(tidyr); library(zoo); library(slider); library(here);library(broom)

#####test Paracou####

obs <- read.csv("G:/My Drive/cesab bioforest/Data/aggregated_data_v9.csv")

setDT(obs)

obs=obs[obs$Site=="Paracou"]


setnames(obs, "Plot", "subplot")

obs$plot=gsub("_.", "",obs$subplot )

obs <- obs[!Year%in%1984:1990]

obs <- dcast(
  obs,
  subplot + plot + Year ~ variable,
  value.var = "value"
)


##### upload data 
load("../clim_by_site.rda")

clim_all=rbindlist(lapply(clim_by_site[["Paracou"]], function(x) x$climate[[1]]),
                   idcol = "plot")

#"Paracou", "Mbaiki", "Corinto", "SUAS", "Lesong", "Ulu Muda", "Sungai Lalang", "Tene", "Jari"

baseline <- clim_all[year >= 1981 & year <= 2010]


clim_stats <- baseline[, .(
  mean_tmax = mean(tmax),
  sd_tmax   = sd(tmax),
  mean_vpd   = mean(vpd),
  sd_vpd     = sd(vpd),
  mean_srad  = mean(srad),
  sd_srad    = sd(srad),
  mean_def  = mean(def),
  sd_def    = sd(def)
), by = month]

clim_all <- merge(clim_all, clim_stats, by = "month", all.x = TRUE)

clim_all[, `:=`(
  z_tmax = (tmax - mean_tmax) / sd_tmax,
  z_vpd  = (vpd  - mean_vpd)  / sd_vpd,
  z_srad = (srad - mean_srad) / sd_srad,
  z_def  = (def  - mean_def)  / sd_def
)]

#climate anomalies follow inventories. For Paracou, mean per year before 1996, two years after

clim_all[, year_bin :=
           ifelse(year <= 1995,
                  year,                     # keep original year
                  1997 + ((year - 1996) %/% 2) * 2   # 2-year bins
           )]


census_anom <- clim_all[,.(
  mean_z_tmax = mean(z_tmax, na.rm = TRUE),
  mean_z_vpd = mean(z_vpd, na.rm = TRUE),
  mean_z_srad = mean(z_srad, na.rm = TRUE),
  mean_z_def = mean(z_def, na.rm = TRUE)
),
by = .(plot, year_bin)]

#AGB values before 1991 are not consistent
census_anom=census_anom[year_bin>=1991] 


merged <- merge(
  obs,
  census_anom,
  by.x = c("Year","plot"), by.y=c("year_bin","plot")
)


control=c(1,6,11,13,14,15)

merged$Treatment <- ifelse(merged$plot %in% control, "control", "disturbed")

merged$Treatment <- factor(merged$Treatment)
merged$plot <- factor(merged$plot)
merged$subplot <- factor(merged$subplot)

GAM k=5 and k=8

NAs for agb_growth - plots 13,14 and 15 miss data for 1991

Code
merged <- merged %>%
  group_by(subplot) %>%
  mutate(
    fitted_GAM5 = predict(
      gam(agb_growth ~ s(Year, k = 5), data = cur_data()),
      newdata = cur_data()
    )
  ) %>%
  ungroup()

p1.1 <- ggplot(merged, aes(Year, agb_growth - fitted_GAM5)) +
  geom_point() +
  ggtitle("GAM5 residuals")

p1.1

Code
ggplot(merged, aes(x = Year, y = fitted_GAM5, color = subplot)) +
  geom_line(size = 1) +  # predicted trajectory
  geom_point(data = merged, 
             aes(x = Year, y = agb_growth, color = subplot),
             inherit.aes = FALSE, size = 2) +  # observed points
  facet_wrap(~ subplot, scales = "free_y")+
  theme_bw() +
  labs(x = "Year", 
       y = "agb_growth", title="GAM-k=5") + theme(legend.position = "none")

GAM at edges - higher standard error, GAM uses many basis functions (Thin Plate Regression Splines (TPRS)) combined to estimate an unknown curve. Instead of minimizing just the residual error the model penalize large curvatures, which can be demonstrated with the standard errors. (for bs=fs, penalty is Shared among the subplots – it is supposed to control for overfitting) **

Code
#see SE at adges

m=gam(agb_growth ~ s(Year, k = 5), data = merged)
pred <- predict(m, newdata = merged, se.fit = TRUE)

setDT(merged)

merged[, se_GAM5 := pred$se.fit]


ggplot(merged, aes(Year, se_GAM5)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE, linewidth = 1.2) +
  labs(title = "Standard error of GAM smooth over Year",
       y = "Standard error") 

Error per subplot

Code
ggplot(merged, aes(Year, agb_growth-fitted_GAM5)) +
geom_point(alpha = 0.3) +
facet_wrap(~ subplot) +
geom_hline(yintercept = 0, linetype = 2) +
labs(y = "Residual",
 title = "Directional GAM k=5 errors by subplot")

Code
bias_by_subplot <- merged %>%
  group_by(subplot) %>%
  arrange(Year) %>%
  summarise(
    res = agb_growth - fitted_GAM5,
    mean_resid = mean(res, na.rm = TRUE),
    edge_early = mean(res[rank(Year) <= 5], na.rm = TRUE),
    edge_late  = mean(res[rank(Year, ties.method = "first", na.last = "keep") >
                           (n_distinct(Year) - 5)], na.rm = TRUE), 
      .groups= "drop"
  )

long_bias <- bias_by_subplot %>%
  pivot_longer(
    cols = c(mean_resid, edge_early, edge_late),
    names_to = "period",
    values_to = "residual"
  )
Code
ggplot(long_bias, aes(residual, subplot, color = period)) +
  geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
  geom_point(size = 3) +
  labs(
    x = "Mean residual (obs - pred)",
    y = "Subplot",
    title = "Directional GAM bias by subplot",
    color = "Period"
  ) +
  theme_minimal()

Code
#Residuals dont seem to display a systematic edge bias:no over-prediction at the beginning of the time series, no under-prediction at the end

GAM k=8

Code
merged <- merged %>%
  group_by(subplot) %>%
  mutate(
    fitted_GAM8 = predict(
      gam(agb_growth ~ s(Year, k = 8), data = cur_data()),
      newdata = cur_data()
    )
  )%>%
  ungroup()

p11.1 <- ggplot(merged, aes(Year, agb_growth - fitted_GAM8)) +
  geom_point() +
  ggtitle("GAM8 residuals")

 
p11.1

Code
p11=ggplot(merged, aes(x = Year, y = fitted_GAM8, color = subplot)) +
  geom_line(size = 1) +  # predicted trajectory
  geom_point(data = merged, 
             aes(x = Year, y = agb_growth, color = subplot),
             inherit.aes = FALSE, size = 2) +  # observed points
  facet_wrap(~ subplot, scales = "free_y")+
  
  theme_bw() +
  labs(x = "Year", 
       y = "agb_growth", title="GAM-k=8") + theme(legend.position = "none")

p11

Code
m=gam(agb_growth ~ s(Year, k = 8), data = merged)
pred <- predict(m, newdata = merged, se.fit = TRUE)

setDT(merged)

merged[, se_GAM8 := pred$se.fit]


ggplot(merged, aes(Year, se_GAM8)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE, linewidth = 1.2) +
  labs(title = "Standard error of GAM smooth over Year",
       y = "Standard error")

Error per subplot

Code
ggplot(merged, aes(Year, agb_growth-fitted_GAM8)) +
geom_point(alpha = 0.3) +
facet_wrap(~ subplot) +
geom_hline(yintercept = 0, linetype = 2) +
labs(y = "Residual",
 title = "Directional GAM errors by subplot")

Code
bias_by_subplot <- merged %>%
  group_by(subplot) %>%
  arrange(Year) %>%
  summarise(
    res = agb_growth - fitted_GAM8,
    mean_resid = mean(res, na.rm = TRUE),
    edge_early = mean(res[rank(Year) <= 5], na.rm = TRUE),
    edge_late  = mean(res[rank(Year, ties.method = "first", na.last = "keep") >
                           (n_distinct(Year) - 5)], na.rm = TRUE), 
      .groups= "drop"
  )



long_bias <- bias_by_subplot %>%
  pivot_longer(
    cols = c(mean_resid, edge_early, edge_late),
    names_to = "period",
    values_to = "residual"
  )
Code
ggplot(long_bias, aes(residual, subplot, color = period)) +
  geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
  geom_point(size = 3) +
  labs(
    x = "Mean residual (obs - pred)",
    y = "Subplot",
    title = "Directional GAM bias by subplot",
    color = "Period"
  ) +
  theme_minimal()

slider

complete=TRUE (reject incomplete windows, Per row: will keep 5 observations- in initial inventories = 5 years, after 1995 = 10 years..

Code
merged <- merged %>%
  group_by(subplot) %>%
  arrange(Year) %>%
  mutate(fitted_slide_complete = slide_dbl(agb_growth, mean, .before = 2, .after = 2, .complete=TRUE))%>%
  ungroup()



p3.1 <- ggplot(merged, aes(Year, agb_growth - fitted_slide_complete)) +
  geom_point() +
  ggtitle("slide residuals")

p3.1

Code
p3=ggplot(merged, aes(x = Year, y = fitted_slide_complete, color = subplot)) +
  geom_line(size = 1) +  # predicted trajectory
  geom_point(data = merged, 
             aes(x = Year, y = agb_growth, color = subplot),
             inherit.aes = FALSE, size = 2) +  # observed points
  facet_wrap(~subplot, scales = "free_y")+
  theme_bw() +
  labs(x = "Year", 
       y = "agb_growth", title="slide - .before = 2, .after = 2, window per observation")+ theme(legend.position = "none")

p3 

Error per subplot

Code
ggplot(merged, aes(Year, agb_growth-fitted_slide_complete)) +
geom_point(alpha = 0.3) +
facet_wrap(~ subplot) +
geom_hline(yintercept = 0, linetype = 2) +
labs(y = "Residual",
 title = "Directional slide errors by subplot")

Code
bias_by_subplot <- merged %>%
  group_by(subplot) %>%
  arrange(Year) %>%
  summarise(
    res = agb_growth - fitted_slide_complete,
    mean_resid = mean(res, na.rm = TRUE),
    edge_early = mean(res[rank(Year) <= 5], na.rm = TRUE),
    edge_late  = mean(res[rank(Year, ties.method = "first", na.last = "keep") >
                           (n_distinct(Year) - 5)], na.rm = TRUE), 
      .groups= "drop"
  )

long_bias <- bias_by_subplot %>%
  pivot_longer(
    cols = c(mean_resid, edge_early, edge_late),
    names_to = "period",
    values_to = "residual"
  )

**inherent of the method?

Code
ggplot(long_bias, aes(residual, subplot, color = period)) +
  geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
  geom_point(size = 3) +
  labs(
    x = "Mean residual (obs - pred)",
    y = "Subplot",
    title = "Directional slide bias by subplot",
    color = "Period"
  ) +
  theme_minimal()

Code
#Residuals display a systematic edge bias: over-prediction at the beginning of the time series and under-prediction at the end, #consistent with boundary effects in penalized smoothers.

Differences on window sizes among methods

Code
merged <- merged %>% arrange(subplot, Year)


merged <- merged %>%
  group_by(subplot) %>%
  mutate(
    # ----------------------------
    # 1) Sliding by ROWS
    # ----------------------------
    slide_row = slide_dbl(
      agb_growth,
      mean,
      .before = 2,
      .after  = 2,
      .complete = TRUE
    ),

    n_row = slide_int(
      agb_growth,
      ~ length(.x),
      .before = 2,
      .after  = 2,
      .complete = TRUE
    ),

    # ----------------------------
    # 2) Sliding by YEARS
    # ----------------------------
    slide_year = slide_index_dbl(
      .x = agb_growth,
      .i = Year,
      .f = mean,
      .before = 2,
      .after  = 2,
      .complete = TRUE
    ),

    n_year = slide_index_int(
      .x = agb_growth,
      .i = Year,
      .f = ~ length(.x),
      .before = 2,
      .after  = 2,
      .complete = TRUE
    ),
        # ----------------------------
    # 2) Sliding by YEARS complete=F
    # ----------------------------
    slide_year_false = slide_index_dbl(
      .x = agb_growth,
      .i = Year,
      .f = mean,
      .before = 2,
      .after  = 2,
      .complete = FALSE
    ),

    n_year_false = slide_index_int(
      .x = agb_growth,
      .i = Year,
      .f = ~ length(.x),
      .before = 2,
      .after  = 2,
      .complete = FALSE
    )
  ) %>%
  ungroup()

ggplot(
  filter(merged, subplot == "1_1"),
  aes(x = Year)
) +
  geom_line(aes(y = n_row), linewidth = 1) +
  geom_line(aes(y = n_year), linetype = "dashed", linewidth = 1) +
    geom_line(aes(y = n_year_false), color = "red") +
  labs(
    y = "Points per window",
    title = "ex.: Subplot 1_1",
    subtitle = "Solid = row sliding | Dashed = year sliding | Red = year sliding complete=F"
  )

Code
#plot(merged$slide_year_false, merged$slide_year)

complete=TRUE (reject incomplete windows Per year: will keep 5 observations- in initial inventories = 5 years, after 1995 = 5 years (but with INCOMPLETE windows)..

Code
merged <- merged %>%
  group_by(subplot) %>%
  arrange(Year) %>%
  mutate(fitted_slide_completeYear = slide_index_dbl(agb_growth, Year, mean, .before = 2, .after = 2, .complete=TRUE))%>%
  ungroup()

p33.1 <- ggplot(merged, aes(Year, agb_growth - fitted_slide_completeYear)) +
  geom_point() +
  ggtitle("slide residuals")

p33.1

Code
p33=ggplot(merged, aes(x = Year, y = fitted_slide_completeYear, color = subplot)) +
  geom_line(size = 1) +  # predicted trajectory
  geom_point(data = merged, 
             aes(x = Year, y = agb_growth, color = subplot),
             inherit.aes = FALSE, size = 2) +  # observed points
  facet_wrap(~subplot, scales= "free_y")+
  theme_bw() +
  labs(x = "Year", 
       y = "agb_growth", title="slide - .before = 2, .after = 2, window per year")+ theme(legend.position = "none")

p33 

Error per subplot

Code
ggplot(merged, aes(Year, agb_growth-fitted_slide_completeYear)) +
geom_point(alpha = 0.3) +
facet_wrap(~ subplot) +
geom_hline(yintercept = 0, linetype = 2) +
labs(y = "Residual",
 title = "Directional slide errors by subplot")

complete=FALSE, per Year no NAs, use smaller windows at edges

Code
merged <- merged %>%
  group_by(subplot) %>%
  arrange(Year) %>%
  mutate(fitted_slide = slide_dbl(agb_growth, mean, .before = 2, .after = 2, .complete=TRUE))%>%
  ungroup()

p333.1 <- ggplot(merged, aes(Year, agb_growth - fitted_slide)) +
  geom_point() +
  ggtitle("slide residuals")


p333.1

Code
p333=ggplot(merged, aes(x = Year, y = fitted_slide, color = subplot)) +
  geom_line(size = 1) +  # predicted trajectory
  geom_point(data = merged, 
             aes(x = Year, y = agb_growth, color = subplot),
             inherit.aes = FALSE, size = 2) +  # observed points
  facet_wrap(~subplot, scales = "free_y")+
  theme_bw() +
  labs(x = "Year", 
       y = "agb_growth", title="slide - .before = 2, .after = 2 - partial windows at edges")+ theme(legend.position = "none")

p333 

Per year is more wiggly, more bias:

Code
ggplot(
  merged,
  aes(x = Year)
) +
  geom_line(aes(y = slide_row), linewidth = 1) +
  geom_line(aes(y = slide_year), linetype = "dashed", linewidth = 1) +
    geom_line(aes(y = slide_year_false), color = "red")+ 
  facet_wrap(~subplot, scales = "free_y")+
  labs(
    y = "Points per window",
    title = "Solid = row sliding | Dashed = year sliding | Red = year sliding complete=F"
  )

Most opposed models, GAM k=5 (more smooth), slide per year (more wiggly)

Code
ggplot(
  merged,
  aes(x = Year)
) +
  geom_line(aes(y = fitted_GAM5), linewidth = 1) +
      geom_line(aes(y = fitted_GAM8), color = "green")+ 
    geom_line(aes(y = slide_year), color = "red")+ 
      geom_line(aes(y = slide_row), color = "orange")+ 

  facet_wrap(~subplot, scales = "free_y")+
  labs(
    y = "Points per window",
    title = "Black = fitted_GAM5 |  Green = fitted_GAM8 | Red = year sliding | Orange = row sliding"
  )

root mean square error

Code
rmse_results <- merged %>%
  group_by(subplot) %>%
  summarise(
    RMSE_GAM8   = sqrt(mean((agb_growth - fitted_GAM8)^2, na.rm = TRUE)),
    RMSE_GAM5   = sqrt(mean((agb_growth - fitted_GAM5)^2, na.rm = TRUE)),
    RMSE_SLIDE  = sqrt(mean((agb_growth - fitted_slide)^2, na.rm = TRUE)),
    RMSE_SLIDE_COMPLETE  = sqrt(mean((agb_growth - fitted_slide_complete)^2, na.rm = TRUE)),
    RMSE_SLIDE_COMPLETE_YEAR  = sqrt(mean((agb_growth - fitted_slide_completeYear)^2, na.rm = TRUE)),
  ) %>% 
  ungroup()


rmse_long <- rmse_results %>%
  pivot_longer(
    cols = starts_with("RMSE"),
    names_to = "Model",
    values_to = "RMSE"
  )

rmse_long <- rmse_long %>%
  mutate(Model = reorder(Model, RMSE))

ggplot(rmse_long, aes(x = Model, y = RMSE)) +
  geom_boxplot() +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

lm

Code
vars <- grep("^fitted_", names(merged), value = TRUE)

results <- data.frame()

for (v in vars) {
  

  mod <- lm(as.formula(paste0("agb_growth - ", v, " ~ Year")), data = merged)
  tmp <- tidy(mod, conf.int = TRUE) %>%  # IC95%
    mutate(method = v)
  
  results <- bind_rows(results, tmp)
}

effects <- results %>%
  filter(term == "Year")

ggplot(effects,
       aes(x = estimate,
           y = reorder(method, estimate))) +
  
  geom_point(size = 2) +
  geom_errorbarh(aes(xmin = conf.low,
                     xmax = conf.high),
                 height = 0.2) +
  
  geom_vline(xintercept = 0,
             linetype = "dashed") +
  
  labs(x = "Effet de Year (slope ± IC95%)",
       y = "Méthode",
       title = "Year effect") +
  
  theme_minimal()

temporal signal on residuals

Code
methods <- c(
  "fitted_GAM5",
  "fitted_GAM8",
  "fitted_slide",
  "fitted_slide_completeYear",
  "fitted_slide_complete"
)

for (m in methods) {
  merged[[paste0("resid_", m)]] <- merged$agb_growth - merged[[m]]
}

pairwise_corr <- function(df, col) {
  wide <- tidyr::pivot_wider(df[, c("subplot","Year", col)],
                             names_from = subplot, values_from = all_of(col))
  M <- as.matrix(wide[, -1])          # remove Year
  C <- cor(M, use = "pairwise.complete.obs")
  mean(C[upper.tri(C)], na.rm = TRUE)
}

temporal_signal <- data.frame(
  method = methods,
  mean_pairwise_corr = sapply(paste0("resid_", methods),
                              function(x) pairwise_corr(merged, x))
)


ggplot(temporal_signal, aes(x = method, y = mean_pairwise_corr)) +
  geom_boxplot() +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )

Testing full models

**GAM k=5 and slide_complete per Year

Code
##the right GAM model:

merged <- merged %>%
  mutate(
    fitted_GAM5_full = predict(
      gam(agb_growth ~ subplot +s(Year,by = subplot, k = 5), data = cur_data()),
      newdata = cur_data()
    )
  ) 

plot(merged$fitted_GAM5, merged$fitted_GAM5_full)

Code
mod_gam <- gam(
  agb_growth ~ 
    subplot +
    s(Year, by = subplot, k = 5) +
    mean_z_tmax:Treatment,
  data = merged,
  method = "REML"
)

#gam.check(mod_gam)

#summary(mod_gam)



mod_lm_gam <- lm(
  agb_growth-fitted_GAM5 ~ mean_z_tmax:Treatment,
  data = merged,
  method = "REML"
)

#summary(mod_lm_gam)


mod_lm_slide <- lm(
  agb_growth-fitted_slide_completeYear ~ mean_z_tmax:Treatment,
  data = merged,
  method = "REML"
)

#summary(mod_lm_slide)

term_pattern <- "^mean_z_tmax:Treatment"

table_results <- bind_rows(
  
  # ---- GAM ----
  broom.mixed::tidy(mod_gam, parametric = TRUE) %>%
    filter(grepl(term_pattern, term)) %>%
    mutate(
      Model = "Full GAM",
      Fit_stat = paste0("Dev expl = ",
                        round(summary(mod_gam)$dev.expl * 100, 1), "%")
    ),
  
  # ---- LM on GAM residuals ----
  tidy(mod_lm_gam, conf.int = TRUE) %>%
    filter(grepl(term_pattern, term)) %>%
    mutate(
      Model = "LM on GAM residuals",
      Fit_stat = paste0("Adj R2 = ",
                        round(summary(mod_lm_gam)$adj.r.squared, 3))
    ),
  
  # ---- LM on SLIDE residuals ----
  tidy(mod_lm_slide, conf.int = TRUE) %>%
    filter(grepl(term_pattern, term)) %>%
    mutate(
      Model = "LM on SLIDE residuals",
      Fit_stat = paste0("Adj R2 = ",
                        round(summary(mod_lm_slide)$adj.r.squared, 3))
    )
)

final_table <- table_results %>%
  transmute(
    Model,
    Term = term,
    Estimate = round(estimate, 3),
    SE       = round(std.error, 3),
    p_value  = format.pval(p.value, digits = 3),
    Fit_stat
  )

final_table
# A tibble: 6 × 6
  Model                 Term                     Estimate    SE p_value Fit_stat
  <chr>                 <chr>                       <dbl> <dbl> <chr>   <chr>   
1 Full GAM              mean_z_tmax:Treatmentco…   -0.115 0.089 0.19851 Dev exp…
2 Full GAM              mean_z_tmax:Treatmentdi…   -0.155 0.072 0.03294 Dev exp…
3 LM on GAM residuals   mean_z_tmax:Treatmentco…   -0.057 0.057 0.31885 Adj R2 …
4 LM on GAM residuals   mean_z_tmax:Treatmentdi…   -0.078 0.047 0.09788 Adj R2 …
5 LM on SLIDE residuals mean_z_tmax:Treatmentco…   -0.205 0.071 0.00385 Adj R2 …
6 LM on SLIDE residuals mean_z_tmax:Treatmentdi…   -0.187 0.059 0.00157 Adj R2 …